"""
@FileName：且行且歌.py
@Description：

@Author：HeYiQing
@Time：2023/11/26 21:48
"""
import openpyxl
import requests
import re

# title article
# Captures everything between <article> and </article>; DOTALL lets the
# match span multiple lines.
ARTICLE_PATTERN = re.compile(r'<article>(.*?)</article>', re.DOTALL)
# Captures the text between <h1> and the first following <br>.
TITLE_PATTERN = re.compile(r'<h1>(.*?)<br>', re.DOTALL)
# Captures the attribute text of every <img ...> tag.
IMG_PATTERN = re.compile(r'<img(.*?)>')


def extract_article(page_text):
    """Extract the title and the restyled article HTML from a page.

    Args:
        page_text: Full HTML text of a page.

    Returns:
        A ``(title, article_html)`` tuple, where ``article_html`` is the
        ``<article>`` element with ``style="width:100%"`` appended to every
        ``<img>`` tag. Returns ``None`` when the page has no ``<article>``
        element or the article has no ``<h1>...<br>`` title.
    """
    article_match = ARTICLE_PATTERN.search(page_text)
    if article_match is None:
        return None
    body = article_match.group(1)

    # The title is looked up inside the article body, not the whole page.
    title_match = TITLE_PATTERN.search(body)
    if title_match is None:
        # Without a title there is no key to store the article under.
        return None

    # Append a style attribute to every img tag.
    body = IMG_PATTERN.sub(r'<img\1 style="width:100%">', body)
    return title_match.group(1), "<article>" + body + "</article>"


def main():
    """Fetch every URL in the ``title_url`` sheet and store the articles.

    Reads URLs from column B of the ``title_url`` sheet, downloads each page,
    extracts its title and article HTML, then writes the results to a new
    ``title_article`` sheet and saves the workbook in place.
    """
    # Spoof a browser User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    # Load the Excel workbook.
    workbook_path = '../../repo/dili360/excel/且行且歌.xlsx'
    workbook = openpyxl.load_workbook(workbook_path)
    sheet = workbook['title_url']

    # title -> article HTML; a repeated title overwrites the earlier entry.
    all_title_article = {}

    # Column B holds the URLs; the first row is skipped (presumably a header).
    column_url = [cell.value for cell in sheet['B']]
    count = 0
    for url in column_url[1:]:
        if not url:
            # Empty cells come back as None and cannot be requested.
            continue
        try:
            # A timeout keeps one unresponsive server from hanging the run.
            page_text = requests.get(url=url, headers=headers, timeout=30).text
        except requests.RequestException as exc:
            # Skip this URL so one failure does not discard all progress.
            print('请求失败，已跳过: ' + str(url) + ' (' + str(exc) + ')')
            continue

        parsed = extract_article(page_text)
        if parsed is None:
            continue
        title, article = parsed
        all_title_article[title] = article
        # Increment before printing so the final total is not off by one.
        count += 1
        print(count)
    print('加载完成共' + str(count) + '篇文章')

    # NOTE(review): if a 'title_article' sheet already exists, openpyxl
    # presumably creates a renamed duplicate instead of reusing it; confirm
    # whether re-runs should replace the old sheet.
    article_sheet = workbook.create_sheet(title='title_article')
    article_sheet['A1'] = 'title'
    article_sheet['B1'] = 'article'
    # Row 1 is the header, so data starts at row 2.
    for row, (a_title, a_article) in enumerate(all_title_article.items(), start=2):
        article_sheet.cell(row, 1).value = a_title
        article_sheet.cell(row, 2).value = a_article
        print(a_title + '-----下载完成')
    workbook.save(workbook_path)
    print('~~~~~title-article---全部下载完成~~~~~')


if __name__ == '__main__':
    main()
